library(rworldmap)
library(plyr)
library(dplyr)
library(purrr)
library(magrittr)
library(ggplot2)
library(GGally)
library(devtools)
library(viridis)

load_all()
## Loading sos
data(country_data)
country_data <- country_data[!is.na(country_data$systems_per_cap), ]
country_data <- na.omit(country_data)

data_map <- joinCountryData2Map(dF = country_data, joinCode = "ISO3", nameJoinColumn = "iso3", verbose = FALSE)
## 163 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 82 codes from the map weren't represented in your data

Maps and descriptions of variables

I merged various country-level datasets to get a data frame with the following columns.

3-letter country-code (iso3)

[A map doesn’t make sense for this variable.]

DALYs lost to all causes (daly_all)

spplot(data_map, zcol = "daly_all",
       col.regions = viridis(100),
       main = "Dalys Lost To All Causes")

DALYs lost to communicable diseases (daly_comm)

spplot(data_map, zcol = "daly_comm",
       col.regions = viridis(100),
       main = "Dalys Lost To Communicable Diseases")

deaths from all causes (death_all)

spplot(data_map, zcol = "death_all",
       col.regions = viridis(100),
       main = "Deaths From All Causes")

deaths from communicable diseases (death_comm)

spplot(data_map, zcol = "death_comm",
       col.regions = viridis(100),
       main = "Deaths From Communicable Diseases")

number of different diseases recorded in that country, per GIDEON (diseases)

spplot(data_map, zcol = "diseases",
       col.regions = viridis(100),
       main = "Number Of Different Diseases Recorded In That Country, Per Gideon")

GDP per capita (gdp_per_cap)

spplot(data_map, zcol = "gdp_per_cap",
       col.regions = viridis(100),
       main = "Gdp Per Capita")

total health expenditure per capita, in USD (health_exp_tot_usd)

spplot(data_map, zcol = "health_exp_tot_usd",
       col.regions = viridis(100),
       main = "Total Health Expenditure Per Capita, In Usd")

government health expenditure per capita, in USD (health_exp_govt_usd)

spplot(data_map, zcol = "health_exp_govt_usd",
       col.regions = viridis(100),
       main = "Government Health Expenditure Per Capita, In Usd")

population (population)

spplot(data_map, zcol = "population",
       col.regions = viridis(100),
       main = "Population")

number of surveillance systems (systems)

spplot(data_map, zcol = "systems",
       col.regions = viridis(100),
       main = "Number Of Surveillance Systems")

surveillance systems per capita (systems_per_cap)

spplot(data_map, zcol = "systems_per_cap",
       col.regions = viridis(100),
       main = "Surveillance Systems Per Capita")

Univariate Models

ggpairs(country_data[-1])

daly_all

qplot(daly_all, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ daly_all, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ daly_all, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.298e-05 -1.306e-05 -8.070e-06  1.300e-06  3.493e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.503e-06  1.694e-06   5.019 1.37e-06 ***
## daly_all    -1.070e-10  4.685e-11  -2.284   0.0237 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.204e-05 on 161 degrees of freedom
## Multiple R-squared:  0.03138,    Adjusted R-squared:  0.02536 
## F-statistic: 5.216 on 1 and 161 DF,  p-value: 0.02369

There’s a negative correlation between the DALY rate and number of systems per capita. However, it isn’t very strong.

daly_comm

qplot(daly_comm, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ daly_comm, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ daly_comm, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.122e-05 -1.389e-05 -9.470e-06  1.400e-07  3.517e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.138e-06  9.910e-07   6.194 4.71e-09 ***
## daly_comm   -1.279e-10  6.533e-11  -1.958   0.0519 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.222e-05 on 161 degrees of freedom
## Multiple R-squared:  0.02326,    Adjusted R-squared:  0.01719 
## F-statistic: 3.834 on 1 and 161 DF,  p-value: 0.05194

For the communicable diseases DALY rate, the same correlation is true, but it’s less strong (“marginally significant”).

death_all

qplot(death_all, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ death_all, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ death_all, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.391e-05 -1.280e-05 -7.280e-06  1.030e-06  3.485e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.751e-06  1.742e-06   5.024 1.34e-06 ***
## death_all   -5.354e-09  2.265e-09  -2.364   0.0193 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.2e-05 on 161 degrees of freedom
## Multiple R-squared:  0.03354,    Adjusted R-squared:  0.02753 
## F-statistic: 5.587 on 1 and 161 DF,  p-value: 0.01929

Again, the same correlation is true for death from all causes. It’s a bit stronger here.

death_comm

qplot(death_comm, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ death_comm, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ death_comm, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.051e-05 -1.398e-05 -9.300e-06 -3.000e-07  3.520e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.097e-06  9.833e-07   6.201 4.54e-09 ***
## death_comm  -6.975e-09  3.599e-09  -1.938   0.0544 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.223e-05 on 161 degrees of freedom
## Multiple R-squared:  0.02279,    Adjusted R-squared:  0.01672 
## F-statistic: 3.755 on 1 and 161 DF,  p-value: 0.05441

Death from communicable diseases looks similar to DALYs from communicable diseases.

ggpairs(country_data[, 2:5])

# If it surprises anybody, these are all very collinear. I'm going to pick only one of them to continue with.

country_data %<>%
  select(-daly_comm, -death_all, -death_comm)

diseases

qplot(diseases, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ diseases, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ diseases, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.269e-05 -1.682e-05 -9.330e-06  5.800e-07  3.215e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.101e-05  1.147e-05   6.192 4.76e-09 ***
## diseases    -3.132e-07  5.442e-08  -5.756 4.24e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.89e-05 on 161 degrees of freedom
## Multiple R-squared:  0.1707, Adjusted R-squared:  0.1655 
## F-statistic: 33.13 on 1 and 161 DF,  p-value: 4.244e-08

There is a strong negative correlation between number of diseases observed per country in GIDEON and number of surveillance systems in a country. However, the causality here could be different–it could be that we don’t observe diseases because there aren’t many surveillance systems covering the population.

gdp_per_cap

qplot(gdp_per_cap, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ gdp_per_cap, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ gdp_per_cap, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -9.378e-05 -8.890e-06 -5.900e-06 -1.270e-06  3.581e-04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 2.558e-06  1.267e-06   2.019  0.04512 * 
## gdp_per_cap 9.696e-11  3.519e-11   2.755  0.00654 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.175e-05 on 161 degrees of freedom
## Multiple R-squared:  0.04503,    Adjusted R-squared:  0.0391 
## F-statistic: 7.592 on 1 and 161 DF,  p-value: 0.006539

Positive correlation, significant. The more GDP per capita you have, the higher your number of systems per capita. It’s significant all the time, moreso if you weight for population or nothing.

health_exp_tot_usd

qplot(health_exp_tot_usd, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ health_exp_tot_usd, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ health_exp_tot_usd, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.818e-05 -1.280e-05 -1.036e-05 -5.560e-06  3.569e-04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.623e-06  1.184e-06   3.905 0.000139 ***
## health_exp_tot_usd 2.007e-10  3.036e-10   0.661 0.509478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.266e-05 on 161 degrees of freedom
## Multiple R-squared:  0.002707,   Adjusted R-squared:  -0.003487 
## F-statistic: 0.4371 on 1 and 161 DF,  p-value: 0.5095

No significant correlation.

health_exp_govt_usd

qplot(health_exp_govt_usd, systems_per_cap, data = country_data) + geom_smooth(method = "lm") 

m <- lm(systems_per_cap ~ health_exp_govt_usd, data = country_data, weights = systems)
summary(m)
## 
## Call:
## lm(formula = systems_per_cap ~ health_exp_govt_usd, data = country_data, 
##     weights = systems)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -8.141e-05 -1.144e-05 -9.000e-06 -4.270e-06  3.579e-04 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.026e-06  1.195e-06   3.369 0.000944 ***
## health_exp_govt_usd 6.215e-10  4.553e-10   1.365 0.174103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.247e-05 on 161 degrees of freedom
## Multiple R-squared:  0.01144,    Adjusted R-squared:  0.005303 
## F-statistic: 1.864 on 1 and 161 DF,  p-value: 0.1741

No significant correlation.

qplot(health_exp_tot_usd, health_exp_govt_usd, data = country_data)

# These are really collinear. I'm removing government healthcare spending.

country_data %<>%
  select(-health_exp_govt_usd)

Univariate models, in summary:

  • Negative association: DALY and death rates, number of diseases.
  • Positive association: GDP per capita.
  • No significant associations: healthcare expenditure.

Weirdly, even though GDP per capita and healthcare expenditure per capita are similarly distributed (\(R^2 = 0.82\)), the p-value for the univariate model including GDP is 0.00687, whereas for healthcare expenditure per capita it’s 0.50757.

I removed all but one from the groups of collinear variables (DALY indicators, healthcare expenditure indicators).

A few other thoughts:

  • Some countries are missing, most notably china. This is because I ran na.omit() on the data frame. China must be missing on one of the variables in the data frame, so I’m going to look it up and fix that data merging problem, because it’s an important country to include.
  • A few countries that are high on the outcome measure, outliers (Luxembourg, Malta, Greenland) may be throwing things off and I might omit them.
  • Before I do my final multivariable models, I should sketch out a DAG to crystallize my thoughts on causality. That way, I’ll know which variables I should be excluding from the model a priori.
  • Should the outcome be log(systems_per_capita)?
  • I’m weighting by number of systems per country. My thought is that, this is conceptually because we sampled by number of surveillance systems, not per-country. But we’re running the model at country level, so maybe we shouldn’t weight. The weighting doesn’t change the results that much.

Univariate models and multiple regression model selection

With weights = systems

country_data$systems_per_1000000 <- country_data$systems_per_cap * 1000000


m1 <- glm(systems_per_1000000 ~ gdp_per_cap, data = country_data, weights = systems)
summary(m1)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -93.78   -8.89   -5.90   -1.27  358.10  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 2.558e+00  1.267e+00   2.019  0.04512 * 
## gdp_per_cap 9.696e-05  3.519e-05   2.755  0.00654 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1742.751)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 280583  on 161  degrees of freedom
## AIC: 1300.1
## 
## Number of Fisher Scoring iterations: 2
# Significant, of course.

m2 <- glm(systems_per_1000000 ~ diseases, data = country_data, weights = systems)
summary(m2)
## 
## Call:
## glm(formula = systems_per_1000000 ~ diseases, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -62.69  -16.82   -9.33    0.58  321.51  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 71.00704   11.46752   6.192 4.76e-09 ***
## diseases    -0.31324    0.05442  -5.756 4.24e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1513.491)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 243672  on 161  degrees of freedom
## AIC: 1277.1
## 
## Number of Fisher Scoring iterations: 2
# Significant, positive.

m3 <- glm(systems_per_1000000 ~ daly_all, data = country_data, weights = systems)
summary(m3)
## 
## Call:
## glm(formula = systems_per_1000000 ~ daly_all, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -72.98  -13.06   -8.07    1.30  349.31  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.503e+00  1.694e+00   5.019 1.37e-06 ***
## daly_all    -1.070e-04  4.685e-05  -2.284   0.0237 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1767.665)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 284594  on 161  degrees of freedom
## AIC: 1302.4
## 
## Number of Fisher Scoring iterations: 2
# Significant, of course.

m4 <- glm(systems_per_1000000 ~ health_exp_tot_usd, data = country_data, weights = systems)
summary(m4)
## 
## Call:
## glm(formula = systems_per_1000000 ~ health_exp_tot_usd, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -78.18  -12.80  -10.36   -5.56  356.93  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.6228064  1.1839116   3.905 0.000139 ***
## health_exp_tot_usd 0.0002007  0.0003036   0.661 0.509478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1819.988)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 293018  on 161  degrees of freedom
## AIC: 1307.2
## 
## Number of Fisher Scoring iterations: 2
# Not significant

m5 <- glm(systems_per_1000000 ~ gdp_per_cap + diseases , data = country_data, weights = systems)
summary(m5)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap + diseases, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -71.62  -13.06   -4.83    4.26  324.79  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.551e+01  1.162e+01   5.639 7.58e-08 ***
## gdp_per_cap  7.111e-05  3.277e-05   2.170   0.0315 *  
## diseases    -2.961e-01  5.438e-02  -5.446 1.91e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1479.411)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 236706  on 160  degrees of freedom
## AIC: 1274.4
## 
## Number of Fisher Scoring iterations: 2
# All are still significant.

m6 <- glm(systems_per_1000000 ~ gdp_per_cap * diseases , data = country_data, weights = systems)
summary(m6)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap * diseases, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -71.97  -11.65   -5.33    1.65  328.68  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           4.418e+01  1.866e+01   2.368   0.0191 *
## gdp_per_cap           7.259e-04  4.503e-04   1.612   0.1090  
## diseases             -1.946e-01  8.824e-02  -2.206   0.0288 *
## gdp_per_cap:diseases -3.145e-06  2.157e-06  -1.458   0.1468  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1469.077)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 233583  on 159  degrees of freedom
## AIC: 1274.2
## 
## Number of Fisher Scoring iterations: 2
# No, this messes things up. We remove the interaction term.

m7 <- glm(systems_per_1000000 ~ gdp_per_cap + diseases + daly_all , data = country_data, weights = systems)
summary(m7)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap + diseases + 
##     daly_all, data = country_data, weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -71.31  -13.49   -5.17    3.65  325.54  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.557e+01  1.165e+01   5.627 8.10e-08 ***
## gdp_per_cap  7.773e-05  4.034e-05   1.927   0.0558 .  
## diseases    -2.996e-01  5.586e-02  -5.363 2.84e-07 ***
## daly_all     1.546e-05  5.462e-05   0.283   0.7775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1487.965)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 236586  on 159  degrees of freedom
## AIC: 1276.3
## 
## Number of Fisher Scoring iterations: 2
# No, we won't keep this. It isn't significant, and it bumps gdp_per_cap up above 0.05.

m8 <- glm(systems_per_1000000 ~ gdp_per_cap + diseases + health_exp_tot_usd , data = country_data, weights = systems)
summary(m8)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap + diseases + 
##     health_exp_tot_usd, data = country_data, weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -73.85  -12.52   -4.68    3.21  328.85  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.512e+01  1.485e+01   3.711 0.000285 ***
## gdp_per_cap         1.663e-04  9.098e-05   1.827 0.069517 .  
## diseases           -2.480e-01  6.928e-02  -3.579 0.000458 ***
## health_exp_tot_usd -8.617e-04  7.687e-04  -1.121 0.263992    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1477.042)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 234850  on 159  degrees of freedom
## AIC: 1275.1
## 
## Number of Fisher Scoring iterations: 2
# And, same.

# So for this section, our winning model is:
m9 <- glm(systems_per_1000000 ~ gdp_per_cap + diseases , data = country_data, weights = systems)
summary(m9)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap + diseases, data = country_data, 
##     weights = systems)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -71.62  -13.06   -4.83    4.26  324.79  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.551e+01  1.162e+01   5.639 7.58e-08 ***
## gdp_per_cap  7.111e-05  3.277e-05   2.170   0.0315 *  
## diseases    -2.961e-01  5.438e-02  -5.446 1.91e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1479.411)
## 
##     Null deviance: 293814  on 162  degrees of freedom
## Residual deviance: 236706  on 160  degrees of freedom
## AIC: 1274.4
## 
## Number of Fisher Scoring iterations: 2
# All are still significant.


models <- list(m1, m2, m3, m4, m5, m6, m7, m8, m9)


print_summary <- function(x) {
  cat("\n")
  print(x$formula)
  # cat("\n")
  print(knitr::kable(summary(x)$coef))
  cat("\n")
}
models %>%
  map(print_summary)

systems_per_1000000 ~ gdp_per_cap

Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.558310 1.2669645 2.019243 0.0451210
gdp_per_cap 0.000097 0.0000352 2.755332 0.0065391

systems_per_1000000 ~ diseases

Estimate Std. Error t value Pr(>|t|)
(Intercept) 71.0070414 11.4675160 6.192016 0
diseases -0.3132355 0.0544204 -5.755849 0

systems_per_1000000 ~ daly_all

Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.502787 1.6942875 5.018503 0.0000014
daly_all -0.000107 0.0000469 -2.283780 0.0236915

systems_per_1000000 ~ health_exp_tot_usd

Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.6228064 1.1839116 3.9046887 0.0001385
health_exp_tot_usd 0.0002007 0.0003036 0.6611229 0.5094785

systems_per_1000000 ~ gdp_per_cap + diseases

Estimate Std. Error t value Pr(>|t|)
(Intercept) 65.5068853 11.6175397 5.638619 0.0000001
gdp_per_cap 0.0000711 0.0000328 2.169986 0.0314829
diseases -0.2961403 0.0543779 -5.445970 0.0000002

systems_per_1000000 ~ gdp_per_cap * diseases

Estimate Std. Error t value Pr(>|t|)
(Intercept) 44.1765229 18.6570563 2.367818 0.0190957
gdp_per_cap 0.0007259 0.0004503 1.611959 0.1089540
diseases -0.1946181 0.0882350 -2.205678 0.0288412
gdp_per_cap:diseases -0.0000031 0.0000022 -1.457905 0.1468391

systems_per_1000000 ~ gdp_per_cap + diseases + daly_all

Estimate Std. Error t value Pr(>|t|)
(Intercept) 65.5655515 11.6529216 5.6265333 0.0000001
gdp_per_cap 0.0000777 0.0000403 1.9269623 0.0557675
diseases -0.2995635 0.0558591 -5.3628393 0.0000003
daly_all 0.0000155 0.0000546 0.2831212 0.7774522

systems_per_1000000 ~ gdp_per_cap + diseases + health_exp_tot_usd

Estimate Std. Error t value Pr(>|t|)
(Intercept) 55.1216537 14.8520199 3.711391 0.0002846
gdp_per_cap 0.0001663 0.0000910 1.827381 0.0695173
diseases -0.2479501 0.0692845 -3.578726 0.0004580
health_exp_tot_usd -0.0008617 0.0007687 -1.120968 0.2639917

systems_per_1000000 ~ gdp_per_cap + diseases

Estimate Std. Error t value Pr(>|t|)
(Intercept) 65.5068853 11.6175397 5.638619 0.0000001
gdp_per_cap 0.0000711 0.0000328 2.169986 0.0314829
diseases -0.2961403 0.0543779 -5.445970 0.0000002

[[1]] NULL

[[2]] NULL

[[3]] NULL

[[4]] NULL

[[5]] NULL

[[6]] NULL

[[7]] NULL

[[8]] NULL

[[9]] NULL

#'

Without weights = systems

m1 <- glm(systems_per_1000000 ~ gdp_per_cap, data = country_data)
summary(m1)
## 
## Call:
## glm(formula = systems_per_1000000 ~ gdp_per_cap, data = country_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.428   -2.603   -1.758   -0.172   65.160  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.054e+00  8.019e-01   2.562   0.0113 *  
## gdp_per_cap 1.281e-04  3.137e-05   4.083    7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 69.44316)
## 
##     Null deviance: 12338  on 162  degrees of freedom
## Residual deviance: 11180  on 161  degrees of freedom
## AIC: 1157.8
## 
## Number of Fisher Scoring iterations: 2
# Significant, of course.

m2 <- glm(systems_per_1000000 ~ diseases, data = country_data)
summary(m2)
## 
## Call:
## glm(formula = systems_per_1000000 ~ diseases, data = country_data)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.613  -3.965  -1.698   1.055  60.644  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.26451    9.77900   6.265 3.27e-09 ***
## diseases    -0.27220    0.04635  -5.872 2.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 63.11445)
## 
##     Null deviance: 12338  on 162  degrees of freedom
## Residual deviance: 10161  on 161  degrees of freedom
## AIC: 1142.2
## 
## Number of Fisher Scoring iterations: 2
# Not significant

m3 <- glm(systems_per_1000000 ~ daly_all, data = country_data)
summary(m3)
## 
## Call:
## glm(formula = systems_per_1000000 ~ daly_all, data = country_data)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -6.051  -3.956  -2.409   0.350  64.364  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.539e+00  1.417e+00   5.320 3.43e-07 ***
## daly_all    -8.777e-05  3.061e-05  -2.868  0.00469 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 72.90864)
## 
##     Null deviance: 12338  on 162  degrees of freedom
## Residual deviance: 11738  on 161  degrees of freedom
## AIC: 1165.7
## 
## Number of Fisher Scoring iterations: 2
# Significant, positive

m4 <- glm(systems_per_1000000 ~ health_exp_tot_usd, data = country_data)
summary(m4)
## 
## Call:
## glm(formula = systems_per_1000000 ~ health_exp_tot_usd, data = country_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.943   -2.797   -2.337   -0.438   65.331  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.7246064  0.7771305   3.506  0.00059 ***
## health_exp_tot_usd 0.0010679  0.0003468   3.079  0.00244 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 72.36978)
## 
##     Null deviance: 12338  on 162  degrees of freedom
## Residual deviance: 11652  on 161  degrees of freedom
## AIC: 1164.5
## 
## Number of Fisher Scoring iterations: 2
# Significant, positive


m5 <- lm(systems_per_1000000 ~ gdp_per_cap + diseases , data = country_data)
summary(m5)
## 
## Call:
## lm(formula = systems_per_1000000 ~ gdp_per_cap + diseases, data = country_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.811  -3.649  -1.636   1.205  60.806 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.164e+01  1.034e+01   4.994 1.53e-06 ***
## gdp_per_cap  7.895e-05  3.114e-05   2.535   0.0122 *  
## diseases    -2.320e-01  4.826e-02  -4.808 3.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.814 on 160 degrees of freedom
## Multiple R-squared:  0.2082, Adjusted R-squared:  0.1983 
## F-statistic: 21.04 on 2 and 160 DF,  p-value: 7.743e-09
# All are still significant.

m6 <- lm(systems_per_1000000 ~ gdp_per_cap * diseases , data = country_data)
summary(m6)
## 
## Call:
## lm(formula = systems_per_1000000 ~ gdp_per_cap * diseases, data = country_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.663  -3.193  -1.381   1.027  60.290 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           3.209e+01  1.202e+01   2.670  0.00838 **
## gdp_per_cap           1.650e-03  5.256e-04   3.139  0.00202 **
## diseases             -1.380e-01  5.663e-02  -2.436  0.01594 * 
## gdp_per_cap:diseases -7.771e-06  2.596e-06  -2.994  0.00320 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.626 on 159 degrees of freedom
## Multiple R-squared:  0.2505, Adjusted R-squared:  0.2363 
## F-statistic: 17.71 on 3 and 159 DF,  p-value: 5.734e-10
# All are significant here, and the R^2 is better. But conceptually... how should I use weights?

m7 <- lm(systems_per_1000000 ~ gdp_per_cap + diseases + daly_all , data = country_data)
summary(m7)
## 
## Call:
## lm(formula = systems_per_1000000 ~ gdp_per_cap + diseases + daly_all, 
##     data = country_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.844  -3.650  -1.658   1.209  60.820 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.166e+01  1.039e+01   4.973 1.69e-06 ***
## gdp_per_cap  7.957e-05  3.541e-05   2.247    0.026 *  
## diseases    -2.324e-01  4.952e-02  -4.694 5.76e-06 ***
## daly_all     1.263e-06  3.389e-05   0.037    0.970    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.838 on 159 degrees of freedom
## Multiple R-squared:  0.2082, Adjusted R-squared:  0.1933 
## F-statistic: 13.94 on 3 and 159 DF,  p-value: 4.104e-08
# DALYs aren't significant here.

m8 <- lm(systems_per_1000000 ~ gdp_per_cap + diseases + health_exp_tot_usd , data = country_data)
summary(m8)
## 
## Call:
## lm(formula = systems_per_1000000 ~ gdp_per_cap + diseases + health_exp_tot_usd, 
##     data = country_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.770  -3.560  -1.594   1.330  60.937 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.069e+01  1.057e+01   4.796 3.70e-06 ***
## gdp_per_cap         1.089e-04  7.255e-05   1.500    0.135    
## diseases           -2.278e-01  4.928e-02  -4.622 7.83e-06 ***
## health_exp_tot_usd -3.473e-04  7.606e-04  -0.457    0.649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.833 on 159 degrees of freedom
## Multiple R-squared:  0.2092, Adjusted R-squared:  0.1943 
## F-statistic: 14.02 on 3 and 159 DF,  p-value: 3.709e-08
# Health expenditure isn't significant and it fucks with GDP's significance.

# So, without `weights = systems`, our final model is:
m9 <- lm(systems_per_1000000 ~ gdp_per_cap * diseases , data = country_data)
summary(m9)
## 
## Call:
## lm(formula = systems_per_1000000 ~ gdp_per_cap * diseases, data = country_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.663  -3.193  -1.381   1.027  60.290 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           3.209e+01  1.202e+01   2.670  0.00838 **
## gdp_per_cap           1.650e-03  5.256e-04   3.139  0.00202 **
## diseases             -1.380e-01  5.663e-02  -2.436  0.01594 * 
## gdp_per_cap:diseases -7.771e-06  2.596e-06  -2.994  0.00320 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.626 on 159 degrees of freedom
## Multiple R-squared:  0.2505, Adjusted R-squared:  0.2363 
## F-statistic: 17.71 on 3 and 159 DF,  p-value: 5.734e-10
models <- list(m1, m2, m3, m4, m5, m6, m7, m8, m9)
models %>%
  map(print_summary)

systems_per_1000000 ~ gdp_per_cap

Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.0541806 0.8019172 2.561587 0.0113373
gdp_per_cap 0.0001281 0.0000314 4.082633 0.0000700

systems_per_1000000 ~ diseases

Estimate Std. Error t value Pr(>|t|)
(Intercept) 61.2645121 9.7790040 6.264903 0
diseases -0.2721971 0.0463531 -5.872245 0

systems_per_1000000 ~ daly_all

Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.5389563 1.4171512 5.319797 0.0000003
daly_all -0.0000878 0.0000306 -2.867577 0.0046903

systems_per_1000000 ~ health_exp_tot_usd

Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.7246064 0.7771305 3.505983 0.0005895
health_exp_tot_usd 0.0010679 0.0003468 3.079452 0.0024390

NULL

Estimate Std. Error t value Pr(>|t|)
(Intercept) 51.638516 10.3405492 4.993788 0.0000015
gdp_per_cap 0.000079 0.0000311 2.535215 0.0121972
diseases -0.232045 0.0482636 -4.807864 0.0000035

NULL

Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.0901888 12.0208877 2.669536 0.0083841
gdp_per_cap 0.0016496 0.0005256 3.138757 0.0020228
diseases -0.1379609 0.0566282 -2.436258 0.0159442
gdp_per_cap:diseases -0.0000078 0.0000026 -2.993542 0.0031985

NULL

Estimate Std. Error t value Pr(>|t|)
(Intercept) 51.6593847 10.3880702 4.9729530 0.0000017
gdp_per_cap 0.0000796 0.0000354 2.2474051 0.0259898
diseases -0.2324329 0.0495206 -4.6936607 0.0000058
daly_all 0.0000013 0.0000339 0.0372729 0.9703141

NULL

Estimate Std. Error t value Pr(>|t|)
(Intercept) 50.6938341 10.5706620 4.7957104 0.0000037
gdp_per_cap 0.0001089 0.0000725 1.5004474 0.1354817
diseases -0.2277647 0.0492832 -4.6215442 0.0000078
health_exp_tot_usd -0.0003473 0.0007606 -0.4566105 0.6485742

NULL

Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.0901888 12.0208877 2.669536 0.0083841
gdp_per_cap 0.0016496 0.0005256 3.138757 0.0020228
diseases -0.1379609 0.0566282 -2.436258 0.0159442
gdp_per_cap:diseases -0.0000078 0.0000026 -2.993542 0.0031985

[[1]] NULL

[[2]] NULL

[[3]] NULL

[[4]] NULL

[[5]] NULL

[[6]] NULL

[[7]] NULL

[[8]] NULL

[[9]] NULL

#'

I should ask Andrew about whether to use weights = systems. I think conceptually it is the right thing to do.